In [1]:
from IPython.core.display import display, HTML
display(HTML("""<style> .container {width:96% !important;}</style>"""))

from IPython.display import IFrame
In [2]:
import pandas as pd
import numpy as np
# from plotly.offline import init_notebook_mode, iplot
# import cufflinks as cf
# init_notebook_mode()
# cf.go_offline()
from __future__ import division
In [3]:
import xgboost as xgb
In [4]:
import sys
sys.path.insert(0,'../')
from utils.paths import *
In [7]:
print path_SBA
!aws s3 ls --human-readable s3://eh-home/ehda-calvin/SBA_study/
s3://eh-home/ehda-calvin/SBA_study/
                           PRE pdf/
2018-10-29 08:48:12   15.8 KiB 7a_504_FOIA Data Dictionary.xlsx
2018-10-29 08:48:13   28.9 MiB FOIA - 504 (FY1991-Present).xlsx
2018-10-29 08:48:15   52.6 MiB FOIA - 7(a)(FY1991-FY1999).xlsx
2018-10-29 08:48:17  111.1 MiB FOIA - 7(a)(FY2000-FY2009).xlsx
2018-10-29 08:48:19   80.8 MiB FOIA - 7(a)(FY2010-Present).xlsx
2018-10-26 03:13:21  171.1 MiB SBAnational.csv
2018-11-01 07:21:49  232.1 MiB SBAnational_new.csv
2018-11-01 07:11:23   10.4 MiB company_default_record.csv
2018-11-01 07:03:29   67.9 MiB extra_company_info.csv
2018-10-29 03:22:11    9.6 MiB loan_record.csv
2018-10-29 02:42:40    1.4 KiB t0001-10.1080%2F10691898.2018.1434342.csv
2018-10-29 02:42:41  772 Bytes t0002-10.1080%2F10691898.2018.1434342.csv
2018-10-29 02:42:42  785 Bytes t0003-10.1080%2F10691898.2018.1434342.csv
2018-10-29 02:42:43  913 Bytes t0004-10.1080%2F10691898.2018.1434342.csv
2018-10-29 02:42:43  202 Bytes t0005-10.1080%2F10691898.2018.1434342.csv
2018-10-29 02:42:44  203 Bytes t0006-10.1080%2F10691898.2018.1434342.csv
2018-10-29 02:42:45  289 Bytes t0007-10.1080%2F10691898.2018.1434342.csv
2018-10-29 02:42:46  408 Bytes t0008-10.1080%2F10691898.2018.1434342.csv
2018-10-29 02:42:47  204 Bytes t0009-10.1080%2F10691898.2018.1434342.csv
2018-10-29 02:42:48  294 Bytes t0010-10.1080%2F10691898.2018.1434342.csv
2018-10-29 02:42:49  250 Bytes t0011-10.1080%2F10691898.2018.1434342.csv
2018-10-29 02:42:50  316 Bytes t0012-10.1080%2F10691898.2018.1434342.csv
2018-10-29 02:42:50    2.3 KiB t0013-10.1080%2F10691898.2018.1434342.csv
Variable Name LoanNr_ChkDgt: Identifier – Primary Key Name: Borrower Name City: Borrower City State: Borrower State Zip: Borrower Zip Code Bank: Bank Name BankState: Bank State NAICS: North American Industry Classification System code ApprovalDate: Date SBA Commitment Issued ApprovalFY: Fiscal Year of Commitment Term: Loan term in months NoEmp: Number of Business Employees NewExist: 1 = Existing Business, 2 = New Business CreateJob: Number of jobs created RetainedJob: Number of jobs retained FranchiseCode: Franchise Code 00000 or 00001 = No Franchise UrbanRural: 1= Urban, 2= Rural, 0 = Undefined RevLineCr: Revolving Line of Credit : Y = Yes LowDoc: LowDoc Loan Program: Y = Yes, N = No ChgOffDate: The date when a loan is declared to be in default DisbursementDate: Disbursement Date DisbursementGross: Amount Disbursed BalanceGross: Gross amount outstanding MIS_Status: Loan Status ChgOffPrinGr: Charged-off Amount GrAppv: Gross Amount of Loan Approved by Bank SBA_Appv: SBA’s Guaranteed Amount of Approved Loan

Preprocessing

In [88]:
# Preprocessing
import preprocessing as pp
reload(pp)
Out[88]:
<module 'preprocessing' from '../preprocessing.pyc'>
In [89]:
nat = pd.read_csv(path_SBA + 'SBAnational_new.csv', sep = ';', low_memory=False)
In [93]:
# Add job related features

nat['Expanding'] = nat.CreateJob.apply(pp.expanding)
nat['Retaining'] = nat.CreateJob.apply(pp.retaining)
nat['Expanding_ratio'] = nat.apply(lambda x: pp.expanding_ratio(x['CreateJob'], x['NoEmp']), axis= 1).value_counts()
nat['Retaining_ratio'] = nat.apply(lambda x: retaining_ratio(x['RetainedJob'], x['NoEmp']), axis= 1).value_counts()
In [94]:
use_col = ['LoanNr_ChkDgt', 'Name', 'City', 'State', 'Bank', 'BankState', 
           'NAICS', 'ApprovalDate', 'ApprovalFY', 'Term', 'NoEmp', 'NewExist', 
           'CreateJob', 'RetainedJob', 'FranchiseCode', 'UrbanRural', 'RevLineCr', 
           'LowDoc', 'ChgOffDate', 'DisbursementDate', 'DisbursementGross', 
           'BalanceGross', 'ChgOffPrinGr', 'GrAppv', 'SBA_Appv', 
           'default', 'Zip5d', 'Zip3d', 'SBA_ratio', 'RealEstate', 
           'NAICS_default_rate', 'NAICS_group', 'suffix', 'Loan_age', 'Previous_loan', 'default_times',
           'Expanding', 'Retaining', 'Expanding_ratio', 'Retaining_ratio'
          ]
In [95]:
print nat.shape
nat = nat[use_col]
print nat.shape
# nat[use_col].head().T
(897137, 40)
(897137, 40)

Portfolios

In [96]:
# Train & test data in 2003 and 2004, predict in 2005

nat34 = nat[nat.ApprovalFY.isin([2003, 2004])].reset_index(drop = True)
nat5 = nat[nat.ApprovalFY.isin([2005])].reset_index(drop = True)

print nat34.shape, nat5.shape
(126195, 40) (76957, 40)
In [97]:
def extract_train_features(features, drop, categorical):
    print('-----> Extract train features <------')
    print 'dropping unwanted columns'
    features = features.drop(drop, axis=1)

    print 'transforming categorical variables'
    dict_categorical = {}
    for col in categorical:
        cat = pd.Categorical(features[col])
        new_col = col[:-1]+'INT'
        if col[-2] == '_':
            new_col = col[:-1]+'INT'
        else:
            new_col = col+'_INT'
        features.loc[:, new_col] = cat.codes
        dict_categorical[col] = dict([(k, v) for v, k in enumerate(cat.categories)])
    features = features.drop(categorical, axis=1)
    print 'done'
    return dict_categorical, features

def extract_test_features(test, drop, categorical, dict_categorical):
    print('-----> Extract test features <------')
    print 'dropping unwanted columns'
    test=test.drop(drop, axis=1)
    print 'transforming categorical variabless'
    for col in categorical:
        new_col = col[:-1]+'INT'
        if col[-2] == '_':
            new_col = col[:-1]+'INT'
        else:
            new_col = col+'_INT'
        test[new_col] = test[col].map(dict_categorical[col])
        test[new_col].fillna(-1, inplace=True)
    test=test.drop(categorical, axis=1)
    print 'done'
    return test
In [98]:
nat34.head()
Out[98]:
LoanNr_ChkDgt Name City State Bank BankState NAICS ApprovalDate ApprovalFY Term ... NAICS_default_rate NAICS_group suffix Loan_age Previous_loan default_times Expanding Retaining Expanding_ratio Retaining_ratio
0 1005255000 Clark's Inc. VISTA CA CALIFORNIA BANK & TRUST CA 442210 2002-11-06 2003 2 ... 22.0 44 INC 0 0 0 0 0 NaN NaN
1 1005265003 James Byung Ho Park dba Parago DOWNEY CA CALIFORNIA BANK & TRUST CA 422990 2002-11-06 2003 83 ... 19.0 42 NO SUFFIX 0 0 0 0 0 NaN NaN
2 1005275006 Danny W. Temple and Beau M. Te MONTE VISTA CO CALIFORNIA BANK & TRUST CA 561730 2002-11-06 2003 36 ... 24.0 56 NO SUFFIX 0 0 0 0 0 NaN NaN
3 1005285009 J & V Enterprises, LLC COLORADO SPRINGS CO CALIFORNIA BANK & TRUST CA 722211 2002-12-10 2003 36 ... 22.0 72 LLC 0 0 0 0 0 NaN NaN
4 1005295001 Susana Chung dba Law Offices o LOS ANGELES CA CALIFORNIA BANK & TRUST CA 541110 2002-12-10 2003 36 ... 19.0 54 NO SUFFIX 0 0 0 0 0 NaN NaN

5 rows × 40 columns

Train, Test split

In [99]:
from sklearn import model_selection
Train, Test = model_selection.train_test_split(nat34, 
                                               test_size = 0.25, 
                                               random_state = 1868,
                                               stratify = nat34.default 
                                               )

print Train.shape, Test.shape
print Train.default.sum(), Test.default.sum()
print Train.default.sum()/Train.shape[0], Test.default.sum()/Test.shape[0]
(94646, 40) (31549, 40)
15548 5183
0.16427529953722292 0.16428412944942788
In [100]:
Train.head().T
Out[100]:
94281 80222 120325 112496 111901
LoanNr_ChkDgt 7315064009 7097824002 7715604004 7577414007 7567144005
Name COMPULANDZ KOI DESIGN STUDIO INC SALT PRODUCTIONS LTD BRAMAAN ENTERPRISES LLC J G M & ASSOCIATES INC
City GREENVILLE LOS ANGELES NEW YORK HAGUE LEWES
State RI CA NY NY DE
Bank CITIZENS BANK NATL ASSOC CALIFORNIA BANK & TRUST FIRST NIAGARA BANK NATL ASSOC FIRST NIAGARA BANK NATL ASSOC CITIZENS BANK NATL ASSOC
BankState RI CA NY NY RI
NAICS 443120 561611 711410 722110 238320
ApprovalDate 2004-03-31 2004-01-15 2004-08-20 2004-07-01 2004-06-29
ApprovalFY 2004 2004 2004 2004 2004
Term 84 84 84 84 84
NoEmp 1 4 1 72 18
NewExist 1 1 1 1 1
CreateJob 0 0 3 0 0
RetainedJob 1 4 4 72 18
FranchiseCode 0 0 0 0 0
UrbanRural 1 1 1 2 2
RevLineCr 1 1 1 0 1
LowDoc 0 0 0 0 0
ChgOffDate NaN NaN NaN NaN NaN
DisbursementDate NaN 2004-05-31 2004-09-30 2005-10-31 2004-08-31
DisbursementGross 5000 95000 70000 15000 39200
BalanceGross 0 0 0 0 0
ChgOffPrinGr 0 0 0 0 0
GrAppv 5000 95000 50000 60000 20000
SBA_Appv 2500 47500 25000 30000 10000
default 0 0 0 0 0
Zip5d 2828 90033 10014 12836 19958
Zip3d 28 900 100 128 199
SBA_ratio 0.5 0.5 0.5 0.5 0.5
RealEstate 0 0 0 0 0
NAICS_default_rate 22 24 21 22 23
NAICS_group 44 56 71 72 23
suffix NO SUFFIX INC LTD LLC INC
Loan_age 0 0 0 0 0
Previous_loan 0 0 0 0 0
default_times 0 0 0 0 0
Expanding 0 0 1 0 0
Retaining 0 0 1 0 0
Expanding_ratio NaN NaN NaN NaN NaN
Retaining_ratio NaN NaN NaN NaN NaN
In [101]:
print Train.columns.tolist()
['LoanNr_ChkDgt', 'Name', 'City', 'State', 'Bank', 'BankState', 'NAICS', 'ApprovalDate', 'ApprovalFY', 'Term', 'NoEmp', 'NewExist', 'CreateJob', 'RetainedJob', 'FranchiseCode', 'UrbanRural', 'RevLineCr', 'LowDoc', 'ChgOffDate', 'DisbursementDate', 'DisbursementGross', 'BalanceGross', 'ChgOffPrinGr', 'GrAppv', 'SBA_Appv', 'default', 'Zip5d', 'Zip3d', 'SBA_ratio', 'RealEstate', 'NAICS_default_rate', 'NAICS_group', 'suffix', 'Loan_age', 'Previous_loan', 'default_times', 'Expanding', 'Retaining', 'Expanding_ratio', 'Retaining_ratio']
In [102]:
# Preprocessing train set
features = Train
target = Train.default

drop = ['LoanNr_ChkDgt', 'Name', 'ApprovalDate', 'ApprovalFY', 'ChgOffDate', 'DisbursementDate',
        'DisbursementGross', 'BalanceGross', 'ChgOffPrinGr', 'GrAppv', 'SBA_Appv', 'SBA_ratio',
        'default', 'FranchiseCode', 'Term', 'NAICS']

categorical = ['City', 'State', 'Zip5d', 'Zip3d', 'Bank', 'BankState', 'RevLineCr', 
               'LowDoc', 'NAICS_group', 'suffix',
               'Expanding_ratio', 'Retaining_ratio'
              ]

dict_categorical, features = extract_train_features(features,
                                                    drop,
                                                    categorical)

print features.shape
print target.sum()
-----> Extract train features <------
dropping unwanted columns
transforming categorical variables
done
(94646, 24)
15548
In [103]:
features.head()
Out[103]:
NoEmp NewExist CreateJob RetainedJob UrbanRural RealEstate NAICS_default_rate Loan_age Previous_loan default_times ... Zip5d_INT Zip3d_INT Bank_INT BankState_INT RevLineCr_INT LowDoc_INT NAICS_group_INT suffix_INT Expanding_ratio_INT Retaining_ratio_INT
94281 1 1.0 0 1 1 0 22.0 0 0 0 ... 502 18 523 40 1 0 7 20 -1 -1
80222 4 1.0 0 4 1 0 24.0 0 0 0 ... 14104 820 389 4 1 0 14 13 -1 -1
120325 1 1.0 3 4 1 0 21.0 0 0 0 ... 1803 80 1027 35 1 0 17 18 -1 -1
112496 72 1.0 0 72 2 0 22.0 0 0 0 ... 2370 108 1027 35 0 0 18 16 -1 -1
111901 18 1.0 0 18 2 0 23.0 0 0 0 ... 3718 179 523 40 1 0 4 13 -1 -1

5 rows × 24 columns

In [104]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(features, 
                                                                    target, 
                                                                    test_size = 0.25, 
                                                                    random_state=3776,
                                                                    stratify=target 
                                                                   )


dtrain = xgb.DMatrix(X_train.values, label=y_train.values)
dtest = xgb.DMatrix(X_test.values, y_test.values)
num_rounds = 1100
# num_rounds = 2000

params = {'silent':1, 
          'eta':0.01, 
          'max_depth':10, 
          'subsample': 0.7, 
          'colsample_bytree': 0.6,
          'min_child_weight':1, 
          'objective':'binary:logistic', 
          'eval_metric':'auc', 
          'seed':2017, 
          'gamma':0.1,
          'nthread':-1}

watchlist = [(dtrain, 'train'),(dtest,'validation')]
bst=xgb.train(params, dtrain, num_rounds, watchlist, early_stopping_rounds = 50, verbose_eval = True);
num_rounds = bst.best_iteration
print num_rounds
[0]	train-auc:0.750981	validation-auc:0.72794
Multiple eval metrics have been passed: 'validation-auc' will be used for early stopping.

Will train until validation-auc hasn't improved in 50 rounds.
[1]	train-auc:0.763702	validation-auc:0.735342
[2]	train-auc:0.767296	validation-auc:0.745522
[3]	train-auc:0.781004	validation-auc:0.756028
[4]	train-auc:0.781881	validation-auc:0.751226
[5]	train-auc:0.780398	validation-auc:0.7514
[6]	train-auc:0.780711	validation-auc:0.752348
[7]	train-auc:0.783393	validation-auc:0.755415
[8]	train-auc:0.781877	validation-auc:0.754357
[9]	train-auc:0.784498	validation-auc:0.757057
[10]	train-auc:0.783445	validation-auc:0.756525
[11]	train-auc:0.783923	validation-auc:0.755892
[12]	train-auc:0.784974	validation-auc:0.756372
[13]	train-auc:0.78761	validation-auc:0.75719
[14]	train-auc:0.789968	validation-auc:0.758387
[15]	train-auc:0.790721	validation-auc:0.759501
[16]	train-auc:0.790945	validation-auc:0.759681
[17]	train-auc:0.793626	validation-auc:0.761161
[18]	train-auc:0.793109	validation-auc:0.760172
[19]	train-auc:0.794782	validation-auc:0.760946
[20]	train-auc:0.794293	validation-auc:0.760428
[21]	train-auc:0.795085	validation-auc:0.760978
[22]	train-auc:0.794292	validation-auc:0.76071
[23]	train-auc:0.795618	validation-auc:0.760734
[24]	train-auc:0.79827	validation-auc:0.762611
[25]	train-auc:0.800558	validation-auc:0.764853
[26]	train-auc:0.800364	validation-auc:0.765147
[27]	train-auc:0.801009	validation-auc:0.764837
[28]	train-auc:0.801249	validation-auc:0.764838
[29]	train-auc:0.801358	validation-auc:0.764654
[30]	train-auc:0.800753	validation-auc:0.764596
[31]	train-auc:0.800643	validation-auc:0.764278
[32]	train-auc:0.800847	validation-auc:0.76439
[33]	train-auc:0.802556	validation-auc:0.765654
[34]	train-auc:0.802671	validation-auc:0.766076
[35]	train-auc:0.802308	validation-auc:0.766341
[36]	train-auc:0.803779	validation-auc:0.767008
[37]	train-auc:0.803704	validation-auc:0.766465
[38]	train-auc:0.804635	validation-auc:0.766112
[39]	train-auc:0.805185	validation-auc:0.76615
[40]	train-auc:0.804656	validation-auc:0.766226
[41]	train-auc:0.804455	validation-auc:0.766339
[42]	train-auc:0.805387	validation-auc:0.766997
[43]	train-auc:0.80714	validation-auc:0.767479
[44]	train-auc:0.808309	validation-auc:0.767881
[45]	train-auc:0.808465	validation-auc:0.767831
[46]	train-auc:0.807657	validation-auc:0.767645
[47]	train-auc:0.80748	validation-auc:0.767872
[48]	train-auc:0.807766	validation-auc:0.767855
[49]	train-auc:0.807874	validation-auc:0.767578
[50]	train-auc:0.808879	validation-auc:0.76774
[51]	train-auc:0.809815	validation-auc:0.768315
[52]	train-auc:0.810962	validation-auc:0.769291
[53]	train-auc:0.812242	validation-auc:0.770065
[54]	train-auc:0.811935	validation-auc:0.769871
[55]	train-auc:0.812059	validation-auc:0.770118
[56]	train-auc:0.812229	validation-auc:0.770087
[57]	train-auc:0.812348	validation-auc:0.77036
[58]	train-auc:0.813625	validation-auc:0.770884
[59]	train-auc:0.813538	validation-auc:0.770851
[60]	train-auc:0.813835	validation-auc:0.771244
[61]	train-auc:0.813907	validation-auc:0.771296
[62]	train-auc:0.814574	validation-auc:0.77195
[63]	train-auc:0.81507	validation-auc:0.772012
[64]	train-auc:0.815519	validation-auc:0.771772
[65]	train-auc:0.816085	validation-auc:0.771654
[66]	train-auc:0.815839	validation-auc:0.771631
[67]	train-auc:0.816587	validation-auc:0.771764
[68]	train-auc:0.817212	validation-auc:0.772247
[69]	train-auc:0.817167	validation-auc:0.772453
[70]	train-auc:0.817621	validation-auc:0.772774
[71]	train-auc:0.818041	validation-auc:0.772885
[72]	train-auc:0.818037	validation-auc:0.77299
[73]	train-auc:0.817895	validation-auc:0.772916
[74]	train-auc:0.817805	validation-auc:0.772852
[75]	train-auc:0.818447	validation-auc:0.773344
[76]	train-auc:0.81895	validation-auc:0.773306
[77]	train-auc:0.819085	validation-auc:0.77335
[78]	train-auc:0.819595	validation-auc:0.773693
[79]	train-auc:0.820117	validation-auc:0.773652
[80]	train-auc:0.820946	validation-auc:0.774096
[81]	train-auc:0.821253	validation-auc:0.774522
[82]	train-auc:0.821446	validation-auc:0.774721
[83]	train-auc:0.821457	validation-auc:0.774845
[84]	train-auc:0.821449	validation-auc:0.774848
[85]	train-auc:0.82206	validation-auc:0.775022
[86]	train-auc:0.822056	validation-auc:0.774961
[87]	train-auc:0.82254	validation-auc:0.775389
[88]	train-auc:0.823036	validation-auc:0.775796
[89]	train-auc:0.822877	validation-auc:0.775634
[90]	train-auc:0.823321	validation-auc:0.775591
[91]	train-auc:0.823256	validation-auc:0.775697
[92]	train-auc:0.82329	validation-auc:0.775585
[93]	train-auc:0.823474	validation-auc:0.775537
[94]	train-auc:0.824226	validation-auc:0.775788
[95]	train-auc:0.824387	validation-auc:0.775993
[96]	train-auc:0.82456	validation-auc:0.775931
[97]	train-auc:0.824561	validation-auc:0.775813
[98]	train-auc:0.82444	validation-auc:0.775711
[99]	train-auc:0.824793	validation-auc:0.775652
[100]	train-auc:0.825267	validation-auc:0.775855
[101]	train-auc:0.825203	validation-auc:0.775875
[102]	train-auc:0.825607	validation-auc:0.775804
[103]	train-auc:0.826213	validation-auc:0.775948
[104]	train-auc:0.826384	validation-auc:0.775985
[105]	train-auc:0.82654	validation-auc:0.776197
[106]	train-auc:0.826689	validation-auc:0.776192
[107]	train-auc:0.826868	validation-auc:0.776344
[108]	train-auc:0.826968	validation-auc:0.776199
[109]	train-auc:0.826923	validation-auc:0.776167
[110]	train-auc:0.827205	validation-auc:0.776416
[111]	train-auc:0.827362	validation-auc:0.776628
[112]	train-auc:0.827712	validation-auc:0.776711
[113]	train-auc:0.827946	validation-auc:0.776893
[114]	train-auc:0.828373	validation-auc:0.777163
[115]	train-auc:0.829028	validation-auc:0.77748
[116]	train-auc:0.829744	validation-auc:0.777853
[117]	train-auc:0.829948	validation-auc:0.777938
[118]	train-auc:0.83069	validation-auc:0.778209
[119]	train-auc:0.831226	validation-auc:0.778464
[120]	train-auc:0.83132	validation-auc:0.778598
[121]	train-auc:0.832086	validation-auc:0.778797
[122]	train-auc:0.832333	validation-auc:0.778861
[123]	train-auc:0.832793	validation-auc:0.778823
[124]	train-auc:0.832973	validation-auc:0.778985
[125]	train-auc:0.833081	validation-auc:0.779144
[126]	train-auc:0.833226	validation-auc:0.779239
[127]	train-auc:0.833683	validation-auc:0.779375
[128]	train-auc:0.834362	validation-auc:0.779477
[129]	train-auc:0.834661	validation-auc:0.779756
[130]	train-auc:0.834875	validation-auc:0.779879
[131]	train-auc:0.835179	validation-auc:0.779903
[132]	train-auc:0.835232	validation-auc:0.779887
[133]	train-auc:0.835266	validation-auc:0.779938
[134]	train-auc:0.835541	validation-auc:0.779984
[135]	train-auc:0.835644	validation-auc:0.780059
[136]	train-auc:0.83574	validation-auc:0.780291
[137]	train-auc:0.835595	validation-auc:0.780216
[138]	train-auc:0.835807	validation-auc:0.780367
[139]	train-auc:0.835701	validation-auc:0.78026
[140]	train-auc:0.835751	validation-auc:0.780226
[141]	train-auc:0.836083	validation-auc:0.780476
[142]	train-auc:0.836123	validation-auc:0.780559
[143]	train-auc:0.836213	validation-auc:0.780744
[144]	train-auc:0.836492	validation-auc:0.780876
[145]	train-auc:0.836843	validation-auc:0.781097
[146]	train-auc:0.836934	validation-auc:0.781018
[147]	train-auc:0.837111	validation-auc:0.781108
[148]	train-auc:0.837334	validation-auc:0.781158
[149]	train-auc:0.837687	validation-auc:0.781256
[150]	train-auc:0.83775	validation-auc:0.781267
[151]	train-auc:0.837866	validation-auc:0.781399
[152]	train-auc:0.838273	validation-auc:0.781316
[153]	train-auc:0.838679	validation-auc:0.781363
[154]	train-auc:0.839195	validation-auc:0.781542
[155]	train-auc:0.839488	validation-auc:0.781665
[156]	train-auc:0.839875	validation-auc:0.781659
[157]	train-auc:0.840083	validation-auc:0.781661
[158]	train-auc:0.840406	validation-auc:0.781717
[159]	train-auc:0.840607	validation-auc:0.781897
[160]	train-auc:0.840781	validation-auc:0.781916
[161]	train-auc:0.840669	validation-auc:0.781841
[162]	train-auc:0.840857	validation-auc:0.781901
[163]	train-auc:0.840903	validation-auc:0.782053
[164]	train-auc:0.841219	validation-auc:0.782005
[165]	train-auc:0.841354	validation-auc:0.782097
[166]	train-auc:0.841537	validation-auc:0.782337
[167]	train-auc:0.841696	validation-auc:0.782401
[168]	train-auc:0.841769	validation-auc:0.78245
[169]	train-auc:0.842112	validation-auc:0.782496
[170]	train-auc:0.842197	validation-auc:0.782432
[171]	train-auc:0.842473	validation-auc:0.782391
[172]	train-auc:0.842691	validation-auc:0.782446
[173]	train-auc:0.843052	validation-auc:0.782633
[174]	train-auc:0.843239	validation-auc:0.782853
[175]	train-auc:0.843204	validation-auc:0.782843
[176]	train-auc:0.843298	validation-auc:0.782868
[177]	train-auc:0.843579	validation-auc:0.782897
[178]	train-auc:0.843729	validation-auc:0.782912
[179]	train-auc:0.84383	validation-auc:0.783021
[180]	train-auc:0.843972	validation-auc:0.782985
[181]	train-auc:0.843898	validation-auc:0.782995
[182]	train-auc:0.844252	validation-auc:0.783
[183]	train-auc:0.84437	validation-auc:0.782986
[184]	train-auc:0.844825	validation-auc:0.783142
[185]	train-auc:0.844989	validation-auc:0.783277
[186]	train-auc:0.844995	validation-auc:0.783264
[187]	train-auc:0.845163	validation-auc:0.783196
[188]	train-auc:0.845307	validation-auc:0.783294
[189]	train-auc:0.84542	validation-auc:0.783233
[190]	train-auc:0.845551	validation-auc:0.783246
[191]	train-auc:0.845802	validation-auc:0.783493
[192]	train-auc:0.845985	validation-auc:0.783577
[193]	train-auc:0.84624	validation-auc:0.783712
[194]	train-auc:0.846288	validation-auc:0.783683
[195]	train-auc:0.846482	validation-auc:0.78374
[196]	train-auc:0.846781	validation-auc:0.783731
[197]	train-auc:0.846844	validation-auc:0.783802
[198]	train-auc:0.847052	validation-auc:0.783789
[199]	train-auc:0.847459	validation-auc:0.783872
[200]	train-auc:0.847456	validation-auc:0.783842
[201]	train-auc:0.84773	validation-auc:0.783991
[202]	train-auc:0.847852	validation-auc:0.784096
[203]	train-auc:0.848234	validation-auc:0.784128
[204]	train-auc:0.848539	validation-auc:0.784129
[205]	train-auc:0.848719	validation-auc:0.784223
[206]	train-auc:0.848833	validation-auc:0.784232
[207]	train-auc:0.848939	validation-auc:0.784239
[208]	train-auc:0.849175	validation-auc:0.784242
[209]	train-auc:0.849326	validation-auc:0.784312
[210]	train-auc:0.849356	validation-auc:0.784318
[211]	train-auc:0.849491	validation-auc:0.784487
[212]	train-auc:0.849671	validation-auc:0.784478
[213]	train-auc:0.849861	validation-auc:0.784551
[214]	train-auc:0.85002	validation-auc:0.784721
[215]	train-auc:0.850119	validation-auc:0.7848
[216]	train-auc:0.850218	validation-auc:0.784799
[217]	train-auc:0.850329	validation-auc:0.784758
[218]	train-auc:0.850578	validation-auc:0.784744
[219]	train-auc:0.850736	validation-auc:0.784831
[220]	train-auc:0.850776	validation-auc:0.784837
[221]	train-auc:0.850887	validation-auc:0.784855
[222]	train-auc:0.851081	validation-auc:0.784856
[223]	train-auc:0.851321	validation-auc:0.78498
[224]	train-auc:0.851532	validation-auc:0.784922
[225]	train-auc:0.851732	validation-auc:0.785002
[226]	train-auc:0.85189	validation-auc:0.785088
[227]	train-auc:0.851968	validation-auc:0.785096
[228]	train-auc:0.852115	validation-auc:0.785139
[229]	train-auc:0.852349	validation-auc:0.785186
[230]	train-auc:0.852486	validation-auc:0.785246
[231]	train-auc:0.852792	validation-auc:0.785318
[232]	train-auc:0.853037	validation-auc:0.785479
[233]	train-auc:0.853372	validation-auc:0.785473
[234]	train-auc:0.853516	validation-auc:0.785566
[235]	train-auc:0.853626	validation-auc:0.785599
[236]	train-auc:0.85406	validation-auc:0.785622
[237]	train-auc:0.854193	validation-auc:0.785666
[238]	train-auc:0.854344	validation-auc:0.785729
[239]	train-auc:0.854657	validation-auc:0.785797
[240]	train-auc:0.854644	validation-auc:0.785764
[241]	train-auc:0.854994	validation-auc:0.785753
[242]	train-auc:0.855197	validation-auc:0.785736
[243]	train-auc:0.855366	validation-auc:0.78573
[244]	train-auc:0.855608	validation-auc:0.7858
[245]	train-auc:0.855721	validation-auc:0.785827
[246]	train-auc:0.855794	validation-auc:0.785772
[247]	train-auc:0.855909	validation-auc:0.785819
[248]	train-auc:0.855916	validation-auc:0.785793
[249]	train-auc:0.856015	validation-auc:0.785821
[250]	train-auc:0.856021	validation-auc:0.785827
[251]	train-auc:0.856286	validation-auc:0.785943
[252]	train-auc:0.856295	validation-auc:0.78593
[253]	train-auc:0.856543	validation-auc:0.785971
[254]	train-auc:0.85659	validation-auc:0.786032
[255]	train-auc:0.856693	validation-auc:0.786032
[256]	train-auc:0.857023	validation-auc:0.786023
[257]	train-auc:0.857114	validation-auc:0.786067
[258]	train-auc:0.857288	validation-auc:0.786204
[259]	train-auc:0.857573	validation-auc:0.786203
[260]	train-auc:0.85763	validation-auc:0.786237
[261]	train-auc:0.857709	validation-auc:0.786298
[262]	train-auc:0.857805	validation-auc:0.78634
[263]	train-auc:0.858038	validation-auc:0.78642
[264]	train-auc:0.858128	validation-auc:0.786403
[265]	train-auc:0.858347	validation-auc:0.786488
[266]	train-auc:0.858486	validation-auc:0.786531
[267]	train-auc:0.858777	validation-auc:0.786565
[268]	train-auc:0.859024	validation-auc:0.78655
[269]	train-auc:0.85916	validation-auc:0.786588
[270]	train-auc:0.859311	validation-auc:0.786667
[271]	train-auc:0.859395	validation-auc:0.786738
[272]	train-auc:0.859407	validation-auc:0.786797
[273]	train-auc:0.859707	validation-auc:0.786834
[274]	train-auc:0.859751	validation-auc:0.786842
[275]	train-auc:0.859872	validation-auc:0.786888
[276]	train-auc:0.859955	validation-auc:0.7869
[277]	train-auc:0.860104	validation-auc:0.786938
[278]	train-auc:0.860399	validation-auc:0.78696
[279]	train-auc:0.860606	validation-auc:0.787001
[280]	train-auc:0.860708	validation-auc:0.787105
[281]	train-auc:0.860839	validation-auc:0.78709
[282]	train-auc:0.860847	validation-auc:0.787155
[283]	train-auc:0.86109	validation-auc:0.787159
[284]	train-auc:0.861196	validation-auc:0.787158
[285]	train-auc:0.861278	validation-auc:0.787219
[286]	train-auc:0.861433	validation-auc:0.787205
[287]	train-auc:0.86159	validation-auc:0.787189
[288]	train-auc:0.861634	validation-auc:0.787168
[289]	train-auc:0.861721	validation-auc:0.787225
[290]	train-auc:0.861797	validation-auc:0.787279
[291]	train-auc:0.861889	validation-auc:0.787332
[292]	train-auc:0.862033	validation-auc:0.787361
[293]	train-auc:0.862129	validation-auc:0.787381
[294]	train-auc:0.862384	validation-auc:0.787419
[295]	train-auc:0.862592	validation-auc:0.787426
[296]	train-auc:0.862766	validation-auc:0.787436
[297]	train-auc:0.862892	validation-auc:0.787456
[298]	train-auc:0.862987	validation-auc:0.787468
[299]	train-auc:0.863006	validation-auc:0.787483
[300]	train-auc:0.863101	validation-auc:0.787495
[301]	train-auc:0.863364	validation-auc:0.787495
[302]	train-auc:0.863719	validation-auc:0.787583
[303]	train-auc:0.863868	validation-auc:0.787639
[304]	train-auc:0.863936	validation-auc:0.787676
[305]	train-auc:0.864117	validation-auc:0.787659
[306]	train-auc:0.864203	validation-auc:0.787695
[307]	train-auc:0.864432	validation-auc:0.787732
[308]	train-auc:0.864511	validation-auc:0.787742
[309]	train-auc:0.864519	validation-auc:0.787791
[310]	train-auc:0.864565	validation-auc:0.787832
[311]	train-auc:0.86466	validation-auc:0.787844
[312]	train-auc:0.864671	validation-auc:0.787841
[313]	train-auc:0.864956	validation-auc:0.787848
[314]	train-auc:0.865177	validation-auc:0.787846
[315]	train-auc:0.865354	validation-auc:0.787881
[316]	train-auc:0.86553	validation-auc:0.787875
[317]	train-auc:0.865589	validation-auc:0.787912
[318]	train-auc:0.86573	validation-auc:0.787921
[319]	train-auc:0.865818	validation-auc:0.787919
[320]	train-auc:0.86595	validation-auc:0.787988
[321]	train-auc:0.86599	validation-auc:0.788037
[322]	train-auc:0.866237	validation-auc:0.788073
[323]	train-auc:0.866442	validation-auc:0.788076
[324]	train-auc:0.866623	validation-auc:0.788097
[325]	train-auc:0.866758	validation-auc:0.788124
[326]	train-auc:0.866891	validation-auc:0.788151
[327]	train-auc:0.866948	validation-auc:0.788171
[328]	train-auc:0.867223	validation-auc:0.78819
[329]	train-auc:0.867482	validation-auc:0.788179
[330]	train-auc:0.867544	validation-auc:0.788173
[331]	train-auc:0.867663	validation-auc:0.788209
[332]	train-auc:0.867934	validation-auc:0.788251
[333]	train-auc:0.867941	validation-auc:0.788279
[334]	train-auc:0.868109	validation-auc:0.788227
[335]	train-auc:0.868264	validation-auc:0.788204
[336]	train-auc:0.868312	validation-auc:0.788224
[337]	train-auc:0.868499	validation-auc:0.788148
[338]	train-auc:0.868567	validation-auc:0.788135
[339]	train-auc:0.868649	validation-auc:0.788167
[340]	train-auc:0.868784	validation-auc:0.788147
[341]	train-auc:0.868831	validation-auc:0.788165
[342]	train-auc:0.86892	validation-auc:0.788144
[343]	train-auc:0.868902	validation-auc:0.788162
[344]	train-auc:0.86905	validation-auc:0.788176
[345]	train-auc:0.869148	validation-auc:0.788205
[346]	train-auc:0.869297	validation-auc:0.788181
[347]	train-auc:0.869416	validation-auc:0.788199
[348]	train-auc:0.869578	validation-auc:0.788261
[349]	train-auc:0.869669	validation-auc:0.788313
[350]	train-auc:0.869775	validation-auc:0.788322
[351]	train-auc:0.870044	validation-auc:0.788309
[352]	train-auc:0.870142	validation-auc:0.788301
[353]	train-auc:0.87024	validation-auc:0.788334
[354]	train-auc:0.870318	validation-auc:0.788329
[355]	train-auc:0.870345	validation-auc:0.788349
[356]	train-auc:0.870355	validation-auc:0.788352
[357]	train-auc:0.8704	validation-auc:0.788353
[358]	train-auc:0.870543	validation-auc:0.788344
[359]	train-auc:0.870684	validation-auc:0.788347
[360]	train-auc:0.87085	validation-auc:0.788379
[361]	train-auc:0.870877	validation-auc:0.788412
[362]	train-auc:0.871002	validation-auc:0.788443
[363]	train-auc:0.871183	validation-auc:0.788477
[364]	train-auc:0.871337	validation-auc:0.788509
[365]	train-auc:0.87145	validation-auc:0.788552
[366]	train-auc:0.871626	validation-auc:0.788535
[367]	train-auc:0.871644	validation-auc:0.788569
[368]	train-auc:0.871819	validation-auc:0.788545
[369]	train-auc:0.871998	validation-auc:0.788599
[370]	train-auc:0.872013	validation-auc:0.788639
[371]	train-auc:0.872089	validation-auc:0.788664
[372]	train-auc:0.872179	validation-auc:0.788712
[373]	train-auc:0.872249	validation-auc:0.78871
[374]	train-auc:0.872289	validation-auc:0.788708
[375]	train-auc:0.872383	validation-auc:0.78871
[376]	train-auc:0.872434	validation-auc:0.78872
[377]	train-auc:0.872572	validation-auc:0.788677
[378]	train-auc:0.872742	validation-auc:0.788694
[379]	train-auc:0.872918	validation-auc:0.788687
[380]	train-auc:0.873025	validation-auc:0.788723
[381]	train-auc:0.87309	validation-auc:0.788803
[382]	train-auc:0.873117	validation-auc:0.788808
[383]	train-auc:0.873255	validation-auc:0.78884
[384]	train-auc:0.873406	validation-auc:0.788879
[385]	train-auc:0.873503	validation-auc:0.788875
[386]	train-auc:0.873623	validation-auc:0.788865
[387]	train-auc:0.873658	validation-auc:0.788898
[388]	train-auc:0.873731	validation-auc:0.788909
[389]	train-auc:0.873835	validation-auc:0.788931
[390]	train-auc:0.873914	validation-auc:0.788943
[391]	train-auc:0.874012	validation-auc:0.788959
[392]	train-auc:0.874102	validation-auc:0.788964
[393]	train-auc:0.874131	validation-auc:0.788973
[394]	train-auc:0.8743	validation-auc:0.788975
[395]	train-auc:0.874393	validation-auc:0.78899
[396]	train-auc:0.874432	validation-auc:0.789009
[397]	train-auc:0.874526	validation-auc:0.789036
[398]	train-auc:0.874785	validation-auc:0.789022
[399]	train-auc:0.874933	validation-auc:0.789082
[400]	train-auc:0.875062	validation-auc:0.789134
[401]	train-auc:0.875061	validation-auc:0.789154
[402]	train-auc:0.875146	validation-auc:0.78916
[403]	train-auc:0.875335	validation-auc:0.789213
[404]	train-auc:0.875509	validation-auc:0.789223
[405]	train-auc:0.875693	validation-auc:0.789223
[406]	train-auc:0.875714	validation-auc:0.78923
[407]	train-auc:0.875726	validation-auc:0.789245
[408]	train-auc:0.875831	validation-auc:0.789242
[409]	train-auc:0.875891	validation-auc:0.789222
[410]	train-auc:0.876009	validation-auc:0.789218
[411]	train-auc:0.876101	validation-auc:0.789209
[412]	train-auc:0.876237	validation-auc:0.789205
[413]	train-auc:0.876564	validation-auc:0.789185
[414]	train-auc:0.876642	validation-auc:0.789159
[415]	train-auc:0.876768	validation-auc:0.789158
[416]	train-auc:0.876946	validation-auc:0.789195
[417]	train-auc:0.876986	validation-auc:0.7892
[418]	train-auc:0.877085	validation-auc:0.78921
[419]	train-auc:0.877112	validation-auc:0.78926
[420]	train-auc:0.877246	validation-auc:0.789258
[421]	train-auc:0.877298	validation-auc:0.789244
[422]	train-auc:0.877382	validation-auc:0.789265
[423]	train-auc:0.877441	validation-auc:0.789267
[424]	train-auc:0.877469	validation-auc:0.789272
[425]	train-auc:0.877545	validation-auc:0.789264
[426]	train-auc:0.877692	validation-auc:0.78928
[427]	train-auc:0.877797	validation-auc:0.789285
[428]	train-auc:0.877854	validation-auc:0.789288
[429]	train-auc:0.877855	validation-auc:0.789327
[430]	train-auc:0.877926	validation-auc:0.789393
[431]	train-auc:0.877984	validation-auc:0.789376
[432]	train-auc:0.878082	validation-auc:0.789381
[433]	train-auc:0.878105	validation-auc:0.789386
[434]	train-auc:0.87816	validation-auc:0.789375
[435]	train-auc:0.87825	validation-auc:0.789356
[436]	train-auc:0.878362	validation-auc:0.789397
[437]	train-auc:0.878377	validation-auc:0.789426
[438]	train-auc:0.878389	validation-auc:0.789454
[439]	train-auc:0.878449	validation-auc:0.789471
[440]	train-auc:0.878547	validation-auc:0.789502
[441]	train-auc:0.87859	validation-auc:0.789517
[442]	train-auc:0.878717	validation-auc:0.789527
[443]	train-auc:0.878801	validation-auc:0.78956
[444]	train-auc:0.878949	validation-auc:0.789597
[445]	train-auc:0.879004	validation-auc:0.789591
[446]	train-auc:0.879133	validation-auc:0.789587
[447]	train-auc:0.879261	validation-auc:0.789602
[448]	train-auc:0.879419	validation-auc:0.789616
[449]	train-auc:0.879541	validation-auc:0.789618
[450]	train-auc:0.879601	validation-auc:0.789629
[451]	train-auc:0.879612	validation-auc:0.789632
[452]	train-auc:0.87968	validation-auc:0.789619
[453]	train-auc:0.879793	validation-auc:0.789621
[454]	train-auc:0.879934	validation-auc:0.789628
[455]	train-auc:0.880056	validation-auc:0.78965
[456]	train-auc:0.880114	validation-auc:0.789666
[457]	train-auc:0.880249	validation-auc:0.789675
[458]	train-auc:0.880294	validation-auc:0.789707
[459]	train-auc:0.880342	validation-auc:0.78971
[460]	train-auc:0.880393	validation-auc:0.789699
[461]	train-auc:0.880474	validation-auc:0.78972
[462]	train-auc:0.88051	validation-auc:0.789729
[463]	train-auc:0.880537	validation-auc:0.789722
[464]	train-auc:0.880643	validation-auc:0.789722
[465]	train-auc:0.880668	validation-auc:0.789722
[466]	train-auc:0.880709	validation-auc:0.789723
[467]	train-auc:0.880742	validation-auc:0.789714
[468]	train-auc:0.88089	validation-auc:0.789726
[469]	train-auc:0.880978	validation-auc:0.789717
[470]	train-auc:0.88111	validation-auc:0.789717
[471]	train-auc:0.881144	validation-auc:0.78972
[472]	train-auc:0.881297	validation-auc:0.789738
[473]	train-auc:0.881342	validation-auc:0.789727
[474]	train-auc:0.881379	validation-auc:0.789722
[475]	train-auc:0.881474	validation-auc:0.789708
[476]	train-auc:0.881553	validation-auc:0.789707
[477]	train-auc:0.881671	validation-auc:0.78968
[478]	train-auc:0.881712	validation-auc:0.789669
[479]	train-auc:0.881856	validation-auc:0.789715
[480]	train-auc:0.881925	validation-auc:0.78973
[481]	train-auc:0.881928	validation-auc:0.789725
[482]	train-auc:0.88203	validation-auc:0.789724
[483]	train-auc:0.882102	validation-auc:0.789735
[484]	train-auc:0.882175	validation-auc:0.789731
[485]	train-auc:0.882264	validation-auc:0.789742
[486]	train-auc:0.882328	validation-auc:0.789762
[487]	train-auc:0.882395	validation-auc:0.789763
[488]	train-auc:0.882486	validation-auc:0.789754
[489]	train-auc:0.882539	validation-auc:0.789747
[490]	train-auc:0.882568	validation-auc:0.789746
[491]	train-auc:0.882632	validation-auc:0.789779
[492]	train-auc:0.88279	validation-auc:0.789789
[493]	train-auc:0.882807	validation-auc:0.789806
[494]	train-auc:0.88289	validation-auc:0.78979
[495]	train-auc:0.882973	validation-auc:0.7898
[496]	train-auc:0.883045	validation-auc:0.789821
[497]	train-auc:0.883255	validation-auc:0.789916
[498]	train-auc:0.883288	validation-auc:0.789949
[499]	train-auc:0.883448	validation-auc:0.789928
[500]	train-auc:0.88362	validation-auc:0.789949
[501]	train-auc:0.883667	validation-auc:0.789943
[502]	train-auc:0.883699	validation-auc:0.789962
[503]	train-auc:0.883821	validation-auc:0.789996
[504]	train-auc:0.883963	validation-auc:0.79001
[505]	train-auc:0.884022	validation-auc:0.790003
[506]	train-auc:0.884084	validation-auc:0.789986
[507]	train-auc:0.884285	validation-auc:0.790042
[508]	train-auc:0.884356	validation-auc:0.790018
[509]	train-auc:0.88444	validation-auc:0.790028
[510]	train-auc:0.884467	validation-auc:0.790022
[511]	train-auc:0.884521	validation-auc:0.790062
[512]	train-auc:0.884591	validation-auc:0.790058
[513]	train-auc:0.884617	validation-auc:0.790063
[514]	train-auc:0.884724	validation-auc:0.790096
[515]	train-auc:0.884757	validation-auc:0.790087
[516]	train-auc:0.884931	validation-auc:0.790097
[517]	train-auc:0.884987	validation-auc:0.790108
[518]	train-auc:0.885016	validation-auc:0.790111
[519]	train-auc:0.885085	validation-auc:0.790113
[520]	train-auc:0.885231	validation-auc:0.790165
[521]	train-auc:0.885271	validation-auc:0.790192
[522]	train-auc:0.885342	validation-auc:0.790168
[523]	train-auc:0.88543	validation-auc:0.790192
[524]	train-auc:0.885507	validation-auc:0.790219
[525]	train-auc:0.885593	validation-auc:0.790219
[526]	train-auc:0.885702	validation-auc:0.7902
[527]	train-auc:0.885758	validation-auc:0.790197
[528]	train-auc:0.885894	validation-auc:0.790197
[529]	train-auc:0.886002	validation-auc:0.790248
[530]	train-auc:0.886078	validation-auc:0.79024
[531]	train-auc:0.886181	validation-auc:0.790224
[532]	train-auc:0.886249	validation-auc:0.790231
[533]	train-auc:0.886334	validation-auc:0.790251
[534]	train-auc:0.886396	validation-auc:0.790243
[535]	train-auc:0.886433	validation-auc:0.790239
[536]	train-auc:0.886557	validation-auc:0.79022
[537]	train-auc:0.886621	validation-auc:0.790225
[538]	train-auc:0.886799	validation-auc:0.790227
[539]	train-auc:0.886858	validation-auc:0.790256
[540]	train-auc:0.886898	validation-auc:0.790251
[541]	train-auc:0.886932	validation-auc:0.790249
[542]	train-auc:0.887024	validation-auc:0.790253
[543]	train-auc:0.887177	validation-auc:0.790262
[544]	train-auc:0.887245	validation-auc:0.790276
[545]	train-auc:0.887273	validation-auc:0.790272
[546]	train-auc:0.887292	validation-auc:0.790277
[547]	train-auc:0.887343	validation-auc:0.790286
[548]	train-auc:0.887381	validation-auc:0.790309
[549]	train-auc:0.887498	validation-auc:0.79033
[550]	train-auc:0.887538	validation-auc:0.79031
[551]	train-auc:0.887585	validation-auc:0.790313
[552]	train-auc:0.88764	validation-auc:0.790299
[553]	train-auc:0.887691	validation-auc:0.790287
[554]	train-auc:0.887787	validation-auc:0.790288
[555]	train-auc:0.887847	validation-auc:0.790283
[556]	train-auc:0.887927	validation-auc:0.790307
[557]	train-auc:0.887952	validation-auc:0.790317
[558]	train-auc:0.888003	validation-auc:0.790338
[559]	train-auc:0.888078	validation-auc:0.790353
[560]	train-auc:0.888145	validation-auc:0.790369
[561]	train-auc:0.888255	validation-auc:0.790421
[562]	train-auc:0.888346	validation-auc:0.790431
[563]	train-auc:0.888364	validation-auc:0.790428
[564]	train-auc:0.888461	validation-auc:0.79043
[565]	train-auc:0.888573	validation-auc:0.790436
[566]	train-auc:0.88866	validation-auc:0.790458
[567]	train-auc:0.888751	validation-auc:0.790475
[568]	train-auc:0.888805	validation-auc:0.790488
[569]	train-auc:0.888862	validation-auc:0.790497
[570]	train-auc:0.88894	validation-auc:0.790522
[571]	train-auc:0.889076	validation-auc:0.790523
[572]	train-auc:0.889106	validation-auc:0.790528
[573]	train-auc:0.889195	validation-auc:0.790543
[574]	train-auc:0.889266	validation-auc:0.790562
[575]	train-auc:0.889336	validation-auc:0.790562
[576]	train-auc:0.889454	validation-auc:0.790529
[577]	train-auc:0.889492	validation-auc:0.7905
[578]	train-auc:0.889524	validation-auc:0.790486
[579]	train-auc:0.889549	validation-auc:0.790472
[580]	train-auc:0.88959	validation-auc:0.790477
[581]	train-auc:0.889679	validation-auc:0.790459
[582]	train-auc:0.889767	validation-auc:0.79047
[583]	train-auc:0.889824	validation-auc:0.790453
[584]	train-auc:0.889896	validation-auc:0.790482
[585]	train-auc:0.890045	validation-auc:0.7905
[586]	train-auc:0.890091	validation-auc:0.790505
[587]	train-auc:0.890117	validation-auc:0.790515
[588]	train-auc:0.890175	validation-auc:0.790526
[589]	train-auc:0.890267	validation-auc:0.79051
[590]	train-auc:0.890292	validation-auc:0.790504
[591]	train-auc:0.890338	validation-auc:0.790487
[592]	train-auc:0.890383	validation-auc:0.790486
[593]	train-auc:0.890403	validation-auc:0.790476
[594]	train-auc:0.890461	validation-auc:0.790479
[595]	train-auc:0.890616	validation-auc:0.790473
[596]	train-auc:0.890707	validation-auc:0.79045
[597]	train-auc:0.890787	validation-auc:0.790487
[598]	train-auc:0.890807	validation-auc:0.790496
[599]	train-auc:0.890854	validation-auc:0.790501
[600]	train-auc:0.890901	validation-auc:0.790513
[601]	train-auc:0.890986	validation-auc:0.790502
[602]	train-auc:0.891084	validation-auc:0.790503
[603]	train-auc:0.891134	validation-auc:0.790502
[604]	train-auc:0.891165	validation-auc:0.790518
[605]	train-auc:0.891231	validation-auc:0.790532
[606]	train-auc:0.891376	validation-auc:0.790551
[607]	train-auc:0.891488	validation-auc:0.790543
[608]	train-auc:0.891563	validation-auc:0.790549
[609]	train-auc:0.891656	validation-auc:0.790571
[610]	train-auc:0.891811	validation-auc:0.790546
[611]	train-auc:0.891909	validation-auc:0.790539
[612]	train-auc:0.892029	validation-auc:0.790526
[613]	train-auc:0.892098	validation-auc:0.79056
[614]	train-auc:0.892166	validation-auc:0.790561
[615]	train-auc:0.892215	validation-auc:0.790559
[616]	train-auc:0.892303	validation-auc:0.79057
[617]	train-auc:0.892339	validation-auc:0.790563
[618]	train-auc:0.892407	validation-auc:0.790593
[619]	train-auc:0.892442	validation-auc:0.790588
[620]	train-auc:0.892506	validation-auc:0.790589
[621]	train-auc:0.892591	validation-auc:0.790619
[622]	train-auc:0.892751	validation-auc:0.790639
[623]	train-auc:0.892826	validation-auc:0.790649
[624]	train-auc:0.892855	validation-auc:0.790662
[625]	train-auc:0.892928	validation-auc:0.790679
[626]	train-auc:0.892966	validation-auc:0.790682
[627]	train-auc:0.892991	validation-auc:0.790688
[628]	train-auc:0.893031	validation-auc:0.790699
[629]	train-auc:0.893115	validation-auc:0.790693
[630]	train-auc:0.893212	validation-auc:0.790683
[631]	train-auc:0.893247	validation-auc:0.79068
[632]	train-auc:0.893382	validation-auc:0.790688
[633]	train-auc:0.893414	validation-auc:0.790684
[634]	train-auc:0.893432	validation-auc:0.790691
[635]	train-auc:0.893464	validation-auc:0.790675
[636]	train-auc:0.893553	validation-auc:0.790683
[637]	train-auc:0.893676	validation-auc:0.790679
[638]	train-auc:0.893725	validation-auc:0.790675
[639]	train-auc:0.893779	validation-auc:0.790714
[640]	train-auc:0.893804	validation-auc:0.790716
[641]	train-auc:0.893921	validation-auc:0.790723
[642]	train-auc:0.893986	validation-auc:0.790745
[643]	train-auc:0.894005	validation-auc:0.790756
[644]	train-auc:0.894146	validation-auc:0.790745
[645]	train-auc:0.894285	validation-auc:0.790742
[646]	train-auc:0.894375	validation-auc:0.790758
[647]	train-auc:0.894477	validation-auc:0.790753
[648]	train-auc:0.894526	validation-auc:0.790742
[649]	train-auc:0.894608	validation-auc:0.790727
[650]	train-auc:0.894603	validation-auc:0.790731
[651]	train-auc:0.894612	validation-auc:0.790728
[652]	train-auc:0.894693	validation-auc:0.790727
[653]	train-auc:0.894767	validation-auc:0.790715
[654]	train-auc:0.894803	validation-auc:0.79072
[655]	train-auc:0.894902	validation-auc:0.790739
[656]	train-auc:0.894975	validation-auc:0.790754
[657]	train-auc:0.895006	validation-auc:0.79075
[658]	train-auc:0.895072	validation-auc:0.790761
[659]	train-auc:0.895176	validation-auc:0.790745
[660]	train-auc:0.895286	validation-auc:0.790738
[661]	train-auc:0.895377	validation-auc:0.790716
[662]	train-auc:0.895452	validation-auc:0.790717
[663]	train-auc:0.895546	validation-auc:0.790732
[664]	train-auc:0.895609	validation-auc:0.790748
[665]	train-auc:0.895673	validation-auc:0.790738
[666]	train-auc:0.89584	validation-auc:0.790719
[667]	train-auc:0.895916	validation-auc:0.790726
[668]	train-auc:0.895981	validation-auc:0.79072
[669]	train-auc:0.896059	validation-auc:0.790713
[670]	train-auc:0.896102	validation-auc:0.790722
[671]	train-auc:0.896147	validation-auc:0.790752
[672]	train-auc:0.896243	validation-auc:0.790725
[673]	train-auc:0.896276	validation-auc:0.79073
[674]	train-auc:0.896324	validation-auc:0.790714
[675]	train-auc:0.896371	validation-auc:0.790715
[676]	train-auc:0.896511	validation-auc:0.79073
[677]	train-auc:0.896568	validation-auc:0.790696
[678]	train-auc:0.896591	validation-auc:0.790685
[679]	train-auc:0.89662	validation-auc:0.790689
[680]	train-auc:0.89672	validation-auc:0.790703
[681]	train-auc:0.89678	validation-auc:0.790728
[682]	train-auc:0.896865	validation-auc:0.790708
[683]	train-auc:0.896894	validation-auc:0.790705
[684]	train-auc:0.896985	validation-auc:0.790697
[685]	train-auc:0.897054	validation-auc:0.7907
[686]	train-auc:0.897072	validation-auc:0.79069
[687]	train-auc:0.89715	validation-auc:0.790681
[688]	train-auc:0.89724	validation-auc:0.790684
[689]	train-auc:0.897312	validation-auc:0.790691
[690]	train-auc:0.897352	validation-auc:0.790682
[691]	train-auc:0.897401	validation-auc:0.79066
[692]	train-auc:0.897497	validation-auc:0.790651
[693]	train-auc:0.897538	validation-auc:0.790654
[694]	train-auc:0.897615	validation-auc:0.790654
[695]	train-auc:0.897635	validation-auc:0.790672
[696]	train-auc:0.897747	validation-auc:0.790711
[697]	train-auc:0.897878	validation-auc:0.790722
[698]	train-auc:0.89791	validation-auc:0.790733
[699]	train-auc:0.897978	validation-auc:0.790764
[700]	train-auc:0.898138	validation-auc:0.790749
[701]	train-auc:0.898187	validation-auc:0.790753
[702]	train-auc:0.898284	validation-auc:0.790738
[703]	train-auc:0.898297	validation-auc:0.790745
[704]	train-auc:0.898404	validation-auc:0.790752
[705]	train-auc:0.898516	validation-auc:0.790732
[706]	train-auc:0.898595	validation-auc:0.790726
[707]	train-auc:0.898643	validation-auc:0.790724
[708]	train-auc:0.898667	validation-auc:0.790739
[709]	train-auc:0.89873	validation-auc:0.790754
[710]	train-auc:0.89886	validation-auc:0.79075
[711]	train-auc:0.898889	validation-auc:0.790746
[712]	train-auc:0.898954	validation-auc:0.790746
[713]	train-auc:0.898976	validation-auc:0.79074
[714]	train-auc:0.899109	validation-auc:0.790749
[715]	train-auc:0.8992	validation-auc:0.790775
[716]	train-auc:0.899246	validation-auc:0.790759
[717]	train-auc:0.899362	validation-auc:0.790758
[718]	train-auc:0.899428	validation-auc:0.790757
[719]	train-auc:0.899495	validation-auc:0.790759
[720]	train-auc:0.899612	validation-auc:0.790778
[721]	train-auc:0.899704	validation-auc:0.790799
[722]	train-auc:0.899779	validation-auc:0.790801
[723]	train-auc:0.899857	validation-auc:0.790789
[724]	train-auc:0.899947	validation-auc:0.790777
[725]	train-auc:0.900039	validation-auc:0.790781
[726]	train-auc:0.900098	validation-auc:0.790778
[727]	train-auc:0.90016	validation-auc:0.790766
[728]	train-auc:0.900233	validation-auc:0.790751
[729]	train-auc:0.90026	validation-auc:0.790749
[730]	train-auc:0.900349	validation-auc:0.790745
[731]	train-auc:0.900455	validation-auc:0.790777
[732]	train-auc:0.90055	validation-auc:0.790791
[733]	train-auc:0.900632	validation-auc:0.790818
[734]	train-auc:0.900695	validation-auc:0.790803
[735]	train-auc:0.900705	validation-auc:0.790825
[736]	train-auc:0.900847	validation-auc:0.790853
[737]	train-auc:0.900945	validation-auc:0.790853
[738]	train-auc:0.901024	validation-auc:0.790848
[739]	train-auc:0.901168	validation-auc:0.790844
[740]	train-auc:0.901193	validation-auc:0.790859
[741]	train-auc:0.901211	validation-auc:0.790862
[742]	train-auc:0.901224	validation-auc:0.790865
[743]	train-auc:0.901261	validation-auc:0.790872
[744]	train-auc:0.901361	validation-auc:0.790895
[745]	train-auc:0.9014	validation-auc:0.790881
[746]	train-auc:0.901449	validation-auc:0.790866
[747]	train-auc:0.901508	validation-auc:0.790859
[748]	train-auc:0.901564	validation-auc:0.790867
[749]	train-auc:0.901629	validation-auc:0.790861
[750]	train-auc:0.901746	validation-auc:0.790875
[751]	train-auc:0.901775	validation-auc:0.790871
[752]	train-auc:0.901847	validation-auc:0.790868
[753]	train-auc:0.901915	validation-auc:0.790878
[754]	train-auc:0.902034	validation-auc:0.790873
[755]	train-auc:0.90219	validation-auc:0.790883
[756]	train-auc:0.902226	validation-auc:0.790882
[757]	train-auc:0.902273	validation-auc:0.790903
[758]	train-auc:0.902351	validation-auc:0.790894
[759]	train-auc:0.902393	validation-auc:0.790891
[760]	train-auc:0.902437	validation-auc:0.790885
[761]	train-auc:0.902463	validation-auc:0.790888
[762]	train-auc:0.902521	validation-auc:0.790864
[763]	train-auc:0.902553	validation-auc:0.790863
[764]	train-auc:0.902578	validation-auc:0.790859
[765]	train-auc:0.902624	validation-auc:0.790863
[766]	train-auc:0.90268	validation-auc:0.790835
[767]	train-auc:0.902817	validation-auc:0.790832
[768]	train-auc:0.902855	validation-auc:0.790838
[769]	train-auc:0.902881	validation-auc:0.790842
[770]	train-auc:0.902951	validation-auc:0.790841
[771]	train-auc:0.90302	validation-auc:0.79084
[772]	train-auc:0.903049	validation-auc:0.790844
[773]	train-auc:0.903132	validation-auc:0.790856
[774]	train-auc:0.903183	validation-auc:0.790829
[775]	train-auc:0.9032	validation-auc:0.790838
[776]	train-auc:0.903276	validation-auc:0.790817
[777]	train-auc:0.903313	validation-auc:0.790811
[778]	train-auc:0.90334	validation-auc:0.79081
[779]	train-auc:0.903482	validation-auc:0.790833
[780]	train-auc:0.903536	validation-auc:0.79083
[781]	train-auc:0.903559	validation-auc:0.790839
[782]	train-auc:0.903625	validation-auc:0.79084
[783]	train-auc:0.903681	validation-auc:0.79084
[784]	train-auc:0.903702	validation-auc:0.790845
[785]	train-auc:0.903747	validation-auc:0.790832
[786]	train-auc:0.903799	validation-auc:0.790837
[787]	train-auc:0.903826	validation-auc:0.790842
[788]	train-auc:0.903945	validation-auc:0.790855
[789]	train-auc:0.904029	validation-auc:0.790851
[790]	train-auc:0.90404	validation-auc:0.790852
[791]	train-auc:0.904203	validation-auc:0.790821
[792]	train-auc:0.904233	validation-auc:0.790814
[793]	train-auc:0.904345	validation-auc:0.790814
[794]	train-auc:0.904356	validation-auc:0.79082
[795]	train-auc:0.904428	validation-auc:0.790814
[796]	train-auc:0.90451	validation-auc:0.79083
[797]	train-auc:0.904548	validation-auc:0.790834
[798]	train-auc:0.904584	validation-auc:0.790837
[799]	train-auc:0.904602	validation-auc:0.790837
[800]	train-auc:0.90471	validation-auc:0.79085
[801]	train-auc:0.904751	validation-auc:0.790843
[802]	train-auc:0.90481	validation-auc:0.790831
[803]	train-auc:0.904925	validation-auc:0.790837
[804]	train-auc:0.904957	validation-auc:0.79083
[805]	train-auc:0.905018	validation-auc:0.79084
[806]	train-auc:0.905107	validation-auc:0.790834
[807]	train-auc:0.905188	validation-auc:0.790838
Stopping. Best iteration:
[757]	train-auc:0.902273	validation-auc:0.790903

757
In [105]:
# Use all the train data to train the model

X_train_matrix = features.values

#SKLEARN
clf_xgb = xgb.XGBClassifier(silent = params['silent'],
                            learning_rate = params['eta'],  
                            max_depth = params['max_depth'], 
                            subsample = params['subsample'], 
                            colsample_bytree = params['colsample_bytree'],
                            min_child_weight = params['min_child_weight'], 
                            objective = params['objective'], 
                            n_estimators = num_rounds,
                            seed = params['seed'],
                            nthread = params['nthread'],
                            gamma = params['gamma']
                                            )
clf_xgb.fit(X_train_matrix, 
            target, 
            eval_metric ='auc')
Out[105]:
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.6, gamma=0.1, learning_rate=0.01,
       max_delta_step=0, max_depth=10, min_child_weight=1, missing=None,
       n_estimators=757, n_jobs=1, nthread=-1, objective='binary:logistic',
       random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=2017, silent=1, subsample=0.7)
In [106]:
# Preprocessing test set
test_X = Test.copy()
# Preprocessing

# target = plu_n_test.Target

drop = ['LoanNr_ChkDgt', 'Name', 'ApprovalDate', 'ApprovalFY', 'ChgOffDate', 'DisbursementDate',
        'DisbursementGross', 'BalanceGross', 'ChgOffPrinGr', 'GrAppv', 'SBA_Appv', 'SBA_ratio',
        'default', 'FranchiseCode', 'Term', 'NAICS']

categorical = ['City', 'State', 'Zip5d', 'Zip3d', 'Bank', 'BankState', 'RevLineCr', 
               'LowDoc', 'NAICS_group', 'suffix', 
               'Expanding_ratio', 'Retaining_ratio'
              ]

test_bas = extract_test_features(test_X,
                                 drop,
                                 categorical,
                                 dict_categorical)
-----> Extract test features <------
dropping unwanted columns
transforming categorical variabless
done
In [107]:
# Prediction

for col in features.columns:
    if col not in test_bas.columns:
        print 'MISSING COLUMN: ',col
        
test_bas= test_bas[features.columns]
X_test_matrix = test_bas.values
print X_train_matrix.shape, X_test_matrix.shape

y_pred_xgb = clf_xgb.predict_proba(X_test_matrix)
temp = pd.DataFrame(y_pred_xgb)
(94646, 24) (31549, 24)
In [108]:
kernix_check = test_X[['LoanNr_ChkDgt', 'Name', 'ApprovalFY', 'State', 'default', 'ChgOffPrinGr', 'GrAppv', 'SBA_Appv', 'SBA_ratio']]
kernix_check.loc[:, 'prob'] = y_pred_xgb[:,1]
In [109]:
kernix_check.head()
Out[109]:
LoanNr_ChkDgt Name ApprovalFY State default ChgOffPrinGr GrAppv SBA_Appv SBA_ratio prob
104760 7461474001 RELIANCE DATA INC 2004 VA 0 0.0 100000.0 50000.0 0.50 0.109927
24578 6117344004 GILSON'S AUTO BODY SHOP 2003 NM 0 0.0 50000.0 25000.0 0.50 0.122449
124772 7824464009 ALL ABOUT KIDS INC 2004 ND 0 0.0 25000.0 12500.0 0.50 0.058884
20754 6042984002 HCD International, Inc. 2003 MD 0 0.0 100000.0 50000.0 0.50 0.254222
28924 6184124008 PICKERINGTON EYE CARE, INC. 2003 OH 0 0.0 80000.0 68000.0 0.85 0.112118
In [110]:
import pandas as pd
import numpy as np
from datetime import datetime
import seaborn
import matplotlib.pyplot as plt
seaborn.set_style('darkgrid')

def __to_percent1(y, position):
    y = y * 100.0
    return "{:.1f}%".format(y)

def plot_roc(target, predicted_proba, title, save_png=''):
        import matplotlib.pyplot as plt
        import matplotlib.ticker as mtick
        from sklearn.metrics import roc_curve, roc_auc_score

        fpr, tpr, _ = roc_curve(target, predicted_proba)
        auc_plot = roc_auc_score(target, predicted_proba)        
        plt.figure()
        plt.plot(fpr, tpr, '-', alpha=.8, color='red', lw=1.5, label= title + ' (auc = %0.3f)' % auc_plot)
        plt.plot([0, 1], [0, 1], color='navy', lw=1, linestyle='--', label='Chance')

        plt.xlim([0.0, 1.01])
        plt.ylim([0.0, 1.01])
        plt.gca().xaxis.set_major_formatter(mtick.FuncFormatter(__to_percent1))
        plt.gca().yaxis.set_major_formatter(mtick.FuncFormatter(__to_percent1))
        plt.xlabel('Non default cases', fontsize=15)
        plt.ylabel('Default cases', fontsize=15)

        plt.title("\nROC curve - {}\n".format(title), fontsize=18)
        plt.legend(loc="lower right", fontsize=15)
        
        if save_png != '':
                plt.savefig(save_png, format="png")
        else:
                plt.show()
In [111]:
%pylab inline
pylab.rcParams['figure.figsize'] = (10, 10)
plot_roc(kernix_check.default, kernix_check.prob, 'Test')
Populating the interactive namespace from numpy and matplotlib
In [112]:
# Feature Importance

#BOOSTER
dtrain_ex=xgb.DMatrix(features.values,
                      label=target.values,
                      feature_names=features.columns)

bst_ex=xgb.train(params,
                 dtrain_ex,
                 num_boost_round=bst.best_iteration,
                 verbose_eval=False
                )
bst_ex.feature_names[:10]
Out[112]:
['NoEmp',
 'NewExist',
 'CreateJob',
 'RetainedJob',
 'UrbanRural',
 'RealEstate',
 'NAICS_default_rate',
 'Loan_age',
 'Previous_loan',
 'default_times']
In [113]:
def plot_features_importance(bst):
    x = bst.get_fscore()
    sorted_x = sorted(x.items(), key=lambda x: x[1], reverse=True)
    keys_max = [item[0] for item in sorted_x[:30]]
    feat_max = {key: x[key] for key in keys_max}
    fig, ax = plt.subplots(1, 1, figsize=(20, 15))
    xgb.plot_importance(feat_max, ax=ax)
    
def print_features_importance(bst):
    x = bst.get_fscore()
    sorted_x = sorted(x.items(), key=lambda x: x[1], reverse=True)
    keys_max = [item[0] for item in sorted_x[:30]]
    feat_max = {key: x[key] for key in keys_max}
    features_importance = pd.DataFrame([feat_max]).T
    features_importance = features_importance.rename(columns = {0: 'Score'})
    features_importance = features_importance.sort_values('Score', ascending=False)
    return features_importance
In [114]:
plot_features_importance(bst_ex)
In [115]:
feat_max = print_features_importance(bst_ex)
feat_max.rename(columns = {'Score': 'Accumlated score'}).head(15)
Out[115]:
Accumlated score
Zip5d_INT 36279
City_INT 27679
Bank_INT 18481
NoEmp 16610
Zip3d_INT 13513
RetainedJob 12770
NAICS_group_INT 11165
BankState_INT 10945
State_INT 10855
NAICS_default_rate 10550
suffix_INT 9447
CreateJob 9099
UrbanRural 2773
NewExist 2629
RevLineCr_INT 2020
In [116]:
# Tuning grades

kernix_check.prob.hist(bins = 100)
Out[116]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fea819e5750>
In [117]:
np.log(kernix_check.prob).hist(bins = 100)
Out[117]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fea8189d690>
In [118]:
def tuning_grades(num_grades, prob):
    Percentile = list(np.linspace(0, 100, num_grades+1))
    thresholds = [np.percentile(prob, i) for i in Percentile]
    thresholds[0] = 0
    thresholds[-1] = 1
    thresholds = [round(i, 3) for i in thresholds]
    return thresholds

prob_th = tuning_grades(5, kernix_check.prob)
prob_th
Out[118]:
[0.0, 0.051, 0.105, 0.152, 0.23, 1.0]
In [119]:
grades = [str(g) for g in range(1,6)]

kernix_check.loc[:, 'Grade'] = pd.cut(kernix_check.prob, bins=prob_th, labels=grades)
kernix_check.loc[:, 'Grade'] = kernix_check['Grade'].astype('int')
In [120]:
kernix_check.Grade.value_counts().sort_index()
Out[120]:
1    6345
2    6332
3    6236
4    6319
5    6317
Name: Grade, dtype: int64
In [121]:
def plot_grade_roc(target, grade, predicted_proba, title, save_png=''):
        import matplotlib.pyplot as plt
        import matplotlib.ticker as mtick
        from sklearn.metrics import roc_curve, roc_auc_score

        fpr, tpr, _ = roc_curve(target, predicted_proba)
        fpr_plot, tpr_plot, _ = roc_curve(target, grade)
        raw_auc_plot = roc_auc_score(target, predicted_proba)
        new_grade_auc_plot = roc_auc_score(target, grade)

        plt.figure()
        plt.plot(fpr, tpr, '-', color='grey', alpha=.3, label="Raw PD (auc = %0.3f)" % raw_auc_plot)
        plt.plot(fpr_plot, tpr_plot, 'o-', color='red', alpha=.8, lw=1.5, label= title + ' (auc = %0.3f)' % new_grade_auc_plot)
        plt.plot([0, 1], [0, 1], color='navy', lw=1, linestyle='--', label='Chance')

        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.gca().xaxis.set_major_formatter(mtick.FuncFormatter(__to_percent1))
        plt.gca().yaxis.set_major_formatter(mtick.FuncFormatter(__to_percent1))
        plt.xlabel('Non wasted policies', fontsize=15)
        plt.ylabel('Wasted policies', fontsize=15)

        plt.title("\nROC curve - {}\n".format(title), fontsize=18)
        plt.legend(loc="lower right", fontsize=15)

        bbox_props = dict(boxstyle="circle,pad=0.3", fc="white", ec="#2769a6", lw=1)
        bbox_props2 = dict(boxstyle="circle,pad=0.3", fc="white", ec="red", lw=1)
        bbox_props3 = dict(boxstyle="circle,pad=0.3", fc="white", ec="blue", lw=1)
        
        for i in range(0,6):
                if i >= 1 and i <= 6:
                        try:
                                plt.text(fpr_plot[i] - .01, tpr_plot[i] + .05, "%s" % (6 - i), color="red", ha="center", va="center", size=15, bbox=bbox_props2)
                        except:
                                pass
        
        if save_png != '':
                plt.savefig(save_png, format="png")
        plt.show()
In [122]:
plot_grade_roc(kernix_check.default, kernix_check.Grade, kernix_check.prob, 'Test')
In [123]:
kernix_check.groupby('Grade').default.sum()/kernix_check.Grade.value_counts()
Out[123]:
1    0.012924
2    0.075648
3    0.123958
4    0.197500
5    0.411746
dtype: float64

Financial Analysis

In [124]:
kernix_check.head()
Out[124]:
LoanNr_ChkDgt Name ApprovalFY State default ChgOffPrinGr GrAppv SBA_Appv SBA_ratio prob Grade
104760 7461474001 RELIANCE DATA INC 2004 VA 0 0.0 100000.0 50000.0 0.50 0.109927 3
24578 6117344004 GILSON'S AUTO BODY SHOP 2003 NM 0 0.0 50000.0 25000.0 0.50 0.122449 3
124772 7824464009 ALL ABOUT KIDS INC 2004 ND 0 0.0 25000.0 12500.0 0.50 0.058884 2
20754 6042984002 HCD International, Inc. 2003 MD 0 0.0 100000.0 50000.0 0.50 0.254222 5
28924 6184124008 PICKERINGTON EYE CARE, INC. 2003 OH 0 0.0 80000.0 68000.0 0.85 0.112118 3
In [125]:
kernix_check.groupby('Grade').ChgOffPrinGr.sum()
Out[125]:
Grade
1     14832882.0
2     44744486.0
3     55736517.0
4     97815560.0
5    106576583.0
Name: ChgOffPrinGr, dtype: float64

For the banks, if they use machine learning to classify the loan then they may avoid some default especially for grade 5 companies.

In [126]:
def bank_loss(chgoffamount, sba_app):
    if chgoffamount == 0:
        return 0
    else:
        loss = chgoffamount - sba_app
        if loss < 0:
            loss = 0
        return loss
        
kernix_check.loc[:, 'Bank_loss'] = kernix_check.apply(lambda x: bank_loss(x['ChgOffPrinGr'], x['SBA_Appv']), axis = 1)
In [127]:
kernix_check.groupby('Grade').Bank_loss.sum()
Out[127]:
Grade
1      885462.0
2     5129314.0
3     8244013.0
4    12404486.0
5    19440734.0
Name: Bank_loss, dtype: float64

Given that some default is covered bny SBA, the actual loss for banks for each grade is much lower!

National SBA

(United States Small Business Administration)

SBA is a government agency and their objective is to help small business go through difficult period.

Let's change the role of the SBA, let say they are an credit insurance provider and let's say they get 10% of providing insurance to the amount they insure.

In [128]:
kernix_check.groupby('Grade').SBA_ratio.mean()
Out[128]:
Grade
1    0.770335
2    0.628223
3    0.618905
4    0.633864
5    0.629425
Name: SBA_ratio, dtype: float64

It seem like the SBA is able to pick good company to guarantee loan but fail to avoid claim.

In [129]:
# Let's say the premium is 10% of the insure amount

kernix_check.groupby('Grade').SBA_ratio.mean()
Out[129]:
Grade
1    0.770335
2    0.628223
3    0.618905
4    0.633864
5    0.629425
Name: SBA_ratio, dtype: float64
In [130]:
def sba_claim(chgoffamount, sba_app):
    if chgoffamount == 0:
        return 0
    else:
        if sba_app <= chgoffamount:
            claim = sba_app
        else:
            claim = chgoffamount
        return claim
    
kernix_check.loc[:, 'SBA_claim'] = kernix_check.apply(lambda x: sba_claim(x['ChgOffPrinGr'], x['SBA_Appv']), axis = 1)
In [131]:
# claim

kernix_check.groupby('Grade').SBA_claim.sum()
Out[131]:
Grade
1    13947420.0
2    39615172.0
3    47492504.0
4    85411074.0
5    87135849.0
Name: SBA_claim, dtype: float64
In [132]:
# Premium

print 'Premium in each grade ' 
kernix_check.groupby('Grade').SBA_Appv.sum()*0.1
Premium in each grade 
Out[132]:
Grade
1    212578225.8
2     62825865.3
3     49919188.2
4     51494777.5
5     32302182.1
Name: SBA_Appv, dtype: float64
In [133]:
# Gain and loss in each grade

kernix_check.groupby('Grade').SBA_Appv.sum()*0.1 - kernix_check.groupby('Grade').SBA_claim.sum()
Out[133]:
Grade
1    198630805.8
2     23210693.3
3      2426684.2
4    -33916296.5
5    -54833666.9
dtype: float64
In [134]:
# Total premium:
print 'Total premium: ', kernix_check.SBA_Appv.sum()*0.1
print 'Total claim  : ', kernix_check.SBA_claim.sum()
print 'Net profit:    ', kernix_check.SBA_Appv.sum()*0.1 - kernix_check.SBA_claim.sum()
Total premium:  409120238.90000004
Total claim  :  273602019.0
Net profit:     135518219.90000004

Define new SBA ratio with machine learning grades

In [135]:
# Test.SBA_ratio.value_counts().sort_index()

def sba_ratio_ml(grade):
    new_ratio = {1: 1,
                 2: 0.8,
                 3: 0.6,
                 4: 0.4,
                 5: 0.2}
    return new_ratio[grade]

kernix_check.loc[:, 'SBA_ratio_ml'] = kernix_check.Grade.apply(sba_ratio_ml)
kernix_check.loc[:, 'SBA_Appv_ml'] = kernix_check.GrAppv * kernix_check.SBA_ratio_ml
In [136]:
# claim

kernix_check.loc[:, 'SBA_claim_ml'] = kernix_check.apply(lambda x: sba_claim(x['ChgOffPrinGr'], x['SBA_Appv_ml']), axis = 1)

kernix_check.groupby('Grade').SBA_claim_ml.sum()
Out[136]:
Grade
1    14832882.0
2    42896691.0
3    45378904.2
4    59417260.4
5    31547305.8
Name: SBA_claim_ml, dtype: float64
In [137]:
# Premium

print 'Premium in each grade ' 
kernix_check.groupby('Grade').SBA_Appv_ml.sum()*0.1
Premium in each grade 
Out[137]:
Grade
1    2.606351e+08
2    7.201486e+07
3    4.294437e+07
4    2.894439e+07
5    9.450138e+06
Name: SBA_Appv_ml, dtype: float64
In [138]:
# Gain and loss in each grade

kernix_check.groupby('Grade').SBA_Appv_ml.sum()*0.1 - kernix_check.groupby('Grade').SBA_claim_ml.sum()
Out[138]:
Grade
1    2.458022e+08
2    2.911816e+07
3   -2.434533e+06
4   -3.047287e+07
5   -2.209717e+07
dtype: float64
In [139]:
# Total premium:
print 'Total premium: ', kernix_check.SBA_Appv_ml.sum()*0.1
print 'Total claim  : ', kernix_check.SBA_claim_ml.sum()
print 'Net profit:    ', kernix_check.SBA_Appv_ml.sum()*0.1 - kernix_check.SBA_claim_ml.sum()
Total premium:  413988864.90000004
Total claim  :  194073043.39999998
Net profit:     219915821.50000006

Chart

In [140]:
from plotly.offline import init_notebook_mode, iplot
import cufflinks as cf
init_notebook_mode()
cf.go_offline()
In [141]:
(kernix_check.groupby('Grade')[['SBA_Appv', 'SBA_Appv_ml']].sum()*0.1).iplot(kind = 'bar',
title = 'Premium change with machine learning', yTitle = 'USD', xTitle = 'Grades')
In [142]:
(kernix_check.groupby('Grade')[['SBA_claim', 'SBA_claim_ml']].sum()).iplot(kind = 'bar',
title = 'Claim change with machine learning', yTitle = 'USD', xTitle = 'Grades')

Result explaination

In [143]:
import eli5
In [144]:
eli5.show_weights(bst_ex, vec = dict_categorical, importance_type="weight")
Out[144]:
Weight Feature
0.1802 Zip5d_INT
0.1375 City_INT
0.0918 Bank_INT
0.0825 NoEmp
0.0671 Zip3d_INT
0.0634 RetainedJob
0.0555 NAICS_group_INT
0.0544 BankState_INT
0.0539 State_INT
0.0524 NAICS_default_rate
0.0469 suffix_INT
0.0452 CreateJob
0.0138 UrbanRural
0.0131 NewExist
0.0100 RevLineCr_INT
0.0088 Loan_age
0.0065 LowDoc_INT
0.0061 Expanding
0.0030 default_times
0.0029 Previous_loan
… 4 more …
In [145]:
eli5.show_weights(bst_ex, vec = dict_categorical, importance_type="gain")
Out[145]:
Weight Feature
0.6346 RealEstate
0.0721 default_times
0.0593 RevLineCr_INT
0.0264 Bank_INT
0.0241 UrbanRural
0.0226 BankState_INT
0.0152 Previous_loan
0.0136 Loan_age
0.0115 State_INT
0.0110 LowDoc_INT
0.0106 Zip5d_INT
0.0102 Zip3d_INT
0.0102 NAICS_default_rate
0.0097 NewExist
0.0092 Expanding
0.0092 Retaining
0.0090 RetainedJob
0.0086 NAICS_group_INT
0.0083 suffix_INT
0.0082 CreateJob
… 4 more …
In [146]:
eli5.show_weights(bst_ex, vec = dict_categorical, importance_type="cover")
Out[146]:
Weight Feature
0.4101 RealEstate
0.2095 default_times
0.0684 Previous_loan
0.0488 RevLineCr_INT
0.0442 Loan_age
0.0264 UrbanRural
0.0250 BankState_INT
0.0240 Bank_INT
0.0159 NAICS_default_rate
0.0134 LowDoc_INT
0.0114 NewExist
0.0111 suffix_INT
0.0109 NAICS_group_INT
0.0109 Zip5d_INT
0.0108 State_INT
0.0100 Zip3d_INT
0.0096 City_INT
0.0087 CreateJob
0.0087 RetainedJob
0.0085 NoEmp
… 4 more …

Explain individual

In [71]:
eli5.show_prediction(bst_ex, test_bas.iloc[iloc], show_feature_values=True)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-71-6faaee1cd157> in <module>()
----> 1 eli5.show_prediction(bst_ex, test_bas.iloc[iloc], show_feature_values=True)

NameError: name 'iloc' is not defined
In [193]:
# kernix_check.head()
Test[Test.default == 1].head()
Out[193]:
LoanNr_ChkDgt Name City State Zip5d Zip3d Bank BankState NAICS NAICS_group ... DisbursementDate DisbursementGross BalanceGross MIS_Status ChgOffPrinGr GrAppv SBA_Appv default RealEstate SBA_ratio
71264 6920774003 RLB SADDLERY SCOTT (TOWNSHIP OF) PA 15129 151 CITIZENS BANK NATL ASSOC RI 423910 42 ... 2003-12-31 55171.0 0.0 CHGOFF 27659.0 34000.0 17000.0 1 0 0.50
95879 7338694002 EDDIE'S AUTO PARTS AND GLASS NEWPORT AR 72112 721 CAPITAL ONE NATL ASSOC VA 423120 42 ... 2004-04-30 50000.0 0.0 CHGOFF 38238.0 50000.0 25000.0 1 0 0.50
74360 6986774006 BEARS LANDSCAPING & YARD DETAI MANCHESTER NH 3103 31 CITIZENS BANK NATL ASSOC RI 541320 54 ... 2004-01-31 35177.0 0.0 CHGOFF 30821.0 35000.0 17500.0 1 0 0.50
1247 1020445007 G.W. Pools & Spas, Inc. Salisbury MA 1952 19 TD BANK, NATIONAL ASSOCIATION DE 238990 23 ... 2004-08-31 30700.0 0.0 CHGOFF 14652.0 30700.0 15350.0 1 0 0.50
44214 6411294003 CEOPHAS & ASSOCIATES LAUREL ACRES MD 20724 207 BBCN BANK CA 531210 53 ... 2003-06-30 5000.0 0.0 CHGOFF 840.0 5000.0 4250.0 1 0 0.85

5 rows × 33 columns

In [183]:
Test[Test.LoanNr_ChkDgt == 7461474001].iloc[[0]].index
Out[183]:
Int64Index([104760], dtype='int64')
In [203]:
def explain_grade(loan_number, show_data = True, chart = True, model = bst_ex,
                  testset = Test, feeding_data = test_bas, result = kernix_check):
    testset = testset[testset.LoanNr_ChkDgt == loan_number]
    feeding_data = feeding_data.loc[testset.index]
    result = result[result.LoanNr_ChkDgt == loan_number]
#     display(testset)
#     display(feeding_data)
#     display(result)
#     display(eli5.show_prediction(model, feeding_data.iloc[0], show_feature_values=True))

    df = eli5.explain_prediction_df(model, feeding_data.iloc[0])
    feature_groups = {'Location': ['City_INT', 'UrbanRural', 'Zip3d_INT', 'Zip5d_INT', 'State_INT'],
                      'Sector': ['NAICS_group_INT', 'NAICS_default_rate'],
                      'Business': ['RealEstate', 'Franchise', 'NewExist'],
                      'Employees': ['CreateJob', 'RetainedJob', 'NoEmp'],
                      'Loan': ['BankState_INT', 'Bank_INT', 'LowDoc_INT', 'RevLineCr_INT']
                      }
    group_contribution = {}
    for k in feature_groups.keys():
        group_contribution[k] = df[df.feature.isin(feature_groups[k])].weight.sum()
    return group_contribution
x = explain_grade(6920774003)
LoanNr_ChkDgt Name ApprovalFY State default ChgOffPrinGr GrAppv SBA_Appv SBA_ratio prob Grade Bank_loss SBA_claim SBA_ratio_ml SBA_Appv_ml SBA_claim_ml
71264 6920774003 RLB SADDLERY 2004 PA 1 27659.0 34000.0 17000.0 0.5 0.19008 4 10659.0 17000.0 0.4 13600.0 13600.0
In [204]:
x
Out[204]:
{'Business': 0.14575622285673298,
 'Employees': 0.16597186327805954,
 'Loan': 0.046952147119562387,
 'Location': -0.055741784709688172,
 'Sector': -0.081251933705464824}
In [198]:
feature_groups = {'Location': ['City_INT', 'UrbanRural', 'Zip3d_INT', 'Zip5d_INT', 'State_INT'],
                  'Sector': ['NAICS_group_INT', 'NAICS_default_rate'],
                  'Business': ['RealEstate', 'Franchise', 'NewExist'],
                  'Employees': ['CreateJob', 'RetainedJob', 'NoEmp'],
                  'Loan': ['BankState_INT', 'Bank_INT', 'LowDoc_INT', 'RevLineCr_INT']
                  }
In [201]:
x[x.feature.isin(feature_groups['Location'])].weight.sum()
Out[201]:
-0.055741784709688172
In [167]:
test_bas.head()
Out[167]:
NAICS_default_rate NoEmp NewExist CreateJob RetainedJob UrbanRural RealEstate Franchise City_INT State_INT Zip5d_INT Zip3d_INT Bank_INT BankState_INT RevLineCr_INT LowDoc_INT NAICS_group_INT
104760 19.0 2 1.0 6 2 1 0 0 274.0 45 4099.0 200.0 194.0 28 1 0.0 12
24578 20.0 2 1.0 2 2 1 0 0 84.0 32 13889.0 800.0 194.0 28 1 0.0 19
124772 10.0 2 2.0 3 5 1 0 0 3375.0 28 9707.0 539.0 72.0 29 1 0.0 16
20754 19.0 30 1.0 2 32 1 0 0 4672.0 20 3858.0 186.0 194.0 28 0 0.0 12
28924 10.0 5 1.0 0 0 1 0 0 6867.0 35 7044.0 398.0 1240.0 36 0 1.0 16
In [69]:
kernix_check.head()
Out[69]:
LoanNr_ChkDgt Name ApprovalFY State default prob Grade ChgOffPrinGr SBA_ratio SBA_Appv Bank_loss SBA_claim SBA_ratio_ml
104760 7461474001 RELIANCE DATA INC 2004 VA 0 0.124421 3 0.0 0.50 50000.0 0.0 0.0 0.6
24578 6117344004 GILSON'S AUTO BODY SHOP 2003 NM 0 0.121012 3 0.0 0.50 25000.0 0.0 0.0 0.6
124772 7824464009 ALL ABOUT KIDS INC 2004 ND 0 0.084028 2 0.0 0.50 12500.0 0.0 0.0 0.8
20754 6042984002 HCD International, Inc. 2003 MD 0 0.210395 4 0.0 0.50 50000.0 0.0 0.0 0.4
28924 6184124008 PICKERINGTON EYE CARE, INC. 2003 OH 0 0.131635 3 0.0 0.85 68000.0 0.0 0.0 0.6
In [83]:
Test[Test.default == 1].sample(10).T
Out[83]:
85252 30159 38864 39227 106939 50215 66701 111219 3762 5490
LoanNr_ChkDgt 7176914003 6201084001 6321764009 6327264002 7492124004 6513334003 6818344007 7555594003 1050585004 1071345001
Name NORTHEAST WINDOWS INC DULCINEA CARIBBEAN STYLE INC PINAULT HARDWARE COMPANY UNCLE LARRY'S TOY SHOP BABY LAND VEGAS WHOLESALE JEWELER B & I AUTO COLLISION LLC Charles D. Byers DBA Mission C Chuy's Campbell L.L.C.
City WOODRIDGE DALLAS MIAMI WOONSOCKET CHARLOTTESVILLE OKLAHOMA CITY LAS VEGAS REDFORD TOWNSHIP San Antonio TUCSON
State NY TX FL RI VA OK NV MI TX AZ
Zip5d 12789 75215 33172 2895 22903 73132 89103 48239 78233 85719
Zip3d 127 752 331 28 229 731 891 482 782 857
Bank HSBC BK USA NATL ASSOC JPMORGAN CHASE BANK NATL ASSOC BANK OF AMERICA NATL ASSOC CITIZENS BANK NATL ASSOC BBCN BANK BANCFIRST BANK OF AMERICA NATL ASSOC PNC BANK, NATIONAL ASSOCIATION CAPITAL ONE NATL ASSOC BANK OF AMERICA NATL ASSOC
BankState VA IL NC RI CA OK NC DE VA NC
NAICS 423310 446120 811212 444130 451120 448130 421940 811121 339999 722110
NAICS_group 42 44 81 44 44 44 42 81 31 72
NAICS_default_rate 19 22 20 22 23 22 19 20 14 22
ApprovalDate 2004-02-11 2003-03-26 2003-05-15 2003-05-16 2004-06-03 2003-07-22 2003-10-21 2004-06-24 2004-08-31 2004-09-28
ApprovalFY 2004 2003 2003 2003 2004 2003 2004 2004 2004 2004
Term 46 40 58 6 47 41 64 168 37 82
NoEmp 6 5 12 5 1 4 1 2 7 16
NewExist 2 1 1 1 1 2 1 2 1 2
CreateJob 0 4 2 0 0 0 0 3 0 0
RetainedJob 6 1 12 5 1 0 1 2 0 0
FranchiseCode 1 1 1 1 1 1 1 1 1 1
UrbanRural 1 1 1 2 1 1 1 1 1 1
RevLineCr 1 0 1 1 0 0 1 0 0 0
LowDoc 0 0 0 0 0 0 0 0 0 0
ChgOffDate 26-Nov-10 22-Mar-07 24-Oct-05 1-Feb-11 2-Jan-08 30-Aug-06 27-Jul-05 25-Apr-14 11-Dec-08 27-May-08
DisbursementDate 2005-10-31 2003-04-30 2003-07-31 2003-07-31 2004-06-30 2003-08-31 2003-10-31 2004-09-30 2004-10-31 2005-01-31
DisbursementGross 166666 51000 50000 43113 5000 134500 50000 255000 35000 100000
BalanceGross 0 0 0 0 0 0 0 0 0 0
MIS_Status CHGOFF CHGOFF CHGOFF CHGOFF CHGOFF CHGOFF CHGOFF CHGOFF CHGOFF CHGOFF
ChgOffPrinGr 66666 37116 50000 14069 4010 83436 50000 219127 22565 85614
GrAppv 100000 51000 50000 30000 5000 134500 50000 255000 35000 100000
SBA_Appv 50000 43350 25000 15000 4250 114325 25000 127500 17500 50000
default 1 1 1 1 1 1 1 1 1 1
RealEstate 0 0 0 0 0 0 0 0 0 0
SBA_ratio 0.5 0.85 0.5 0.5 0.85 0.85 0.5 0.5 0.5 0.5
In [209]:
!ls ../large_data_files/ASA_loan_data/
SBAcase_11_13_17.csv SBAnational.csv      SBAnational_new.csv